{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "##  第8章 自然言語処理による感情分析（BERT）\n",
    "\n",
    "- 本ファイルでは、第8章で使用するフォルダの作成とファイルのダウンロードを行います。"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import urllib.request\n",
    "import zipfile\n",
    "import tarfile\n",
    "import glob\n",
    "import io"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# フォルダ「data」が存在しない場合は作成する\n",
    "data_dir = \"./data/\"\n",
    "if not os.path.exists(data_dir):\n",
    "    os.mkdir(data_dir)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# フォルダ「vocab」が存在しない場合は作成する\n",
    "vocab_dir = \"./vocab/\"\n",
    "if not os.path.exists(vocab_dir):\n",
    "    os.mkdir(vocab_dir)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "# フォルダ「weights」が存在しない場合は作成する\n",
    "weights_dir = \"./weights/\"\n",
    "if not os.path.exists(weights_dir):\n",
    "    os.mkdir(weights_dir)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "('./vocab/bert-base-uncased-vocab.txt',\n",
       " <http.client.HTTPMessage at 0x7fc9ed54d780>)"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 単語集：ボキャブラリーをダウンロード\n",
    "\n",
    "# 'bert-base-uncased': \n",
    "# https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt\n",
    "\n",
    "save_path=\"./vocab/bert-base-uncased-vocab.txt\"\n",
    "url = \"https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt\"\n",
    "urllib.request.urlretrieve(url, save_path)\n",
    "    "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "# BERTの学習済みモデル 'bert-base-uncased'\n",
    "# https://github.com/huggingface/pytorch-pretrained-BERT/\n",
    "# https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased.tar.gz\n",
    "\n",
    "# ダウンロード\n",
    "save_path = \"./weights/bert-base-uncased.tar.gz\"\n",
    "url = \"https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased.tar.gz\"\n",
    "urllib.request.urlretrieve(url, save_path)\n",
    "\n",
    "# 解凍\n",
    "archive_file = \"./weights/bert-base-uncased.tar.gz\"  # Uncasedは小文字化モードという意味です\n",
    "tar = tarfile.open(archive_file, 'r:gz')\n",
    "tar.extractall('./weights/')  # 解凍\n",
    "tar.close()  # ファイルをクローズ\n",
    "\n",
    "# フォルダ「weights」に「pytorch_model.bin」と「bert_config.json」ができます"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# IMDbデータセットをダウンロードし、tsvファイルに整形\n",
    "\n",
    "http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "# IMDbデータセットをダウンロード。30秒ほどでダウンロードできます\n",
    "target_dir_path=\"./data/\"\n",
    "\n",
    "if not os.path.exists(target_dir_path):\n",
    "    os.mkdir(target_dir_path)\n",
    "    \n",
    "url = \"http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz\"\n",
    "save_path = \"./data/aclImdb_v1.tar.gz\"\n",
    "urllib.request.urlretrieve(url, save_path)\n",
    "\n",
    "# './data/aclImdb_v1.tar.gz'の解凍　1分ほどかかります\n",
    "\n",
    "# tarファイルを読み込み\n",
    "tar = tarfile.open('./data/aclImdb_v1.tar.gz')\n",
    "tar.extractall('./data/')  # 解凍\n",
    "tar.close()  # ファイルをクローズ\n",
    "\n",
    "# フォルダ「data」内にフォルダ「aclImdb」というものができます。"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# IMDbの個別ファイルをtsvにまとめる\n",
    "target_dir_path=\"./data/aclImdb/\"\n",
    "\n",
    "if os.path.exists(target_dir_path):\n",
    "    \n",
    "    # 訓練データの作成\n",
    "\n",
    "    f=open('./data/IMDb_train.tsv','w')\n",
    "\n",
    "    path = './data/aclImdb/train/pos/'\n",
    "    for fname in glob.glob(os.path.join(path,'*.txt')):\n",
    "        with io.open(fname, 'r', encoding=\"utf-8\") as ff:\n",
    "            text = ff.readline()\n",
    "            \n",
    "            # タブがあれば消しておきます\n",
    "            text = text.replace('\\t', \" \")\n",
    "            \n",
    "            text = text+'\\t'+'1'+'\\t'+'\\n'\n",
    "            f.write(text)\n",
    "\n",
    "    path = './data/aclImdb/train/neg/'\n",
    "    for fname in glob.glob(os.path.join(path,'*.txt')):\n",
    "        with io.open(fname, 'r', encoding=\"utf-8\") as ff:\n",
    "            text = ff.readline()\n",
    "            \n",
    "            # タブがあれば消しておきます\n",
    "            text = text.replace('\\t', \" \")\n",
    "            \n",
    "            text = text+'\\t'+'0'+'\\t'+'\\n'\n",
    "            f.write(text)\n",
    "\n",
    "    f.close()\n",
    "    \n",
    "    \n",
    "    # テストデータの作成\n",
    "    f=open('./data/IMDb_test.tsv','w')\n",
    "\n",
    "    path = './data/aclImdb/test/pos/'\n",
    "    for fname in glob.glob(os.path.join(path,'*.txt')):\n",
    "        with io.open(fname, 'r', encoding=\"utf-8\") as ff:\n",
    "            text = ff.readline()\n",
    "            \n",
    "            # タブがあれば消しておきます\n",
    "            text = text.replace('\\t', \" \")\n",
    "        \n",
    "            text = text+'\\t'+'1'+'\\t'+'\\n'\n",
    "            f.write(text)\n",
    "\n",
    "    path = './data/aclImdb/test/neg/'\n",
    "\n",
    "    for fname in glob.glob(os.path.join(path,'*.txt')):\n",
    "        with io.open(fname, 'r', encoding=\"utf-8\") as ff:\n",
    "            text = ff.readline()\n",
    "            \n",
    "            \n",
    "            # タブがあれば消しておきます\n",
    "            text = text.replace('\\t', \" \")\n",
    "            \n",
    "            text = text+'\\t'+'0'+'\\t'+'\\n'\n",
    "            f.write(text)\n",
    "\n",
    "\n",
    "    f.close()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "以上"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
